package com.kd.crawler.parser;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.kd.utils.*;



public class WeiyuDetailPageAnalyse {
	public static String removeHtmlTag(String inputString) {
		
			if (inputString == null)
		
			return null;
		
			String htmlStr = inputString; // 含html标签的字符串
		
			String textStr = "http://news.163.com/16/1219/10/C8L1M7EV000187VE.html";
		
			java.util.regex.Pattern p_script;
		
			java.util.regex.Matcher m_script;
		
			java.util.regex.Pattern p_style;
	
			java.util.regex.Matcher m_style;
		
			java.util.regex.Pattern p_html;
		
			java.util.regex.Matcher m_html;
		
			java.util.regex.Pattern p_special;
		
			java.util.regex.Matcher m_special;
	 
	try {
	
			//定义script的正则表达式{或<script[^>]*?>[\\s\\S]*?<\\/script>
	
			String regEx_script = "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>";
	
			//定义style的正则表达式{或<style[^>]*?>[\\s\\S]*?<\\/style>
		
			String regEx_style = "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>";
	
			// 定义HTML标签的正则表达式
	
			String regEx_html = "<[^>]+>";
	
			// 定义一些特殊字符的正则表达式 如：&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
	
			String regEx_special = "\\&[a-zA-Z]{1,10};";
	
			 
	
			p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
	
			m_script = p_script.matcher(htmlStr);
	
			htmlStr = m_script.replaceAll(""); // 过滤script标签
	
			p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
	
			m_style = p_style.matcher(htmlStr);
	
			htmlStr = m_style.replaceAll(""); // 过滤style标签

			p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
	
			m_html = p_html.matcher(htmlStr);
	
			htmlStr = m_html.replaceAll(""); // 过滤html标签
	
			p_special = Pattern.compile(regEx_special, Pattern.CASE_INSENSITIVE);
	
			m_special = p_special.matcher(htmlStr);
	
			htmlStr = m_special.replaceAll(""); // 过滤特殊标签
	
			textStr = htmlStr;

			} catch (Exception e) {
	
			e.printStackTrace();
	
			}
	
			return textStr;// 返回文本字符串
	
			
      
//    public static String getTextFromHtml(String htmlStr){  
//        htmlStr = delHTMLTag(htmlStr);  
//        htmlStr = htmlStr.replaceAll("&nbsp;", "");  
//        htmlStr = htmlStr.substring(0, htmlStr.indexOf("。")+1);  
//        return htmlStr;  
//    }  
      
//    public static void main(String[] args) {  
//        String str = "<div style='text-align:center;'> 整治“四风” "
//        		+ "  清弊除垢<br/><span style='font-size:14px;'> "
//        		+ "</span><span style='font-size:18px;'>公司召开党的群众路线教育实践活动动员大会</span><br/></div>";  
//        System.out.println(getTextFromHtml(str));  
//    }  

    
			 public static void main(String[] args) {
    	
    		StringBuffer sb = new StringBuffer();
    	
    		try {
    
    		FileReader fr = new FileReader("D:/test1.text");
    
    		BufferedReader br = new BufferedReader(fr);
    	
    		String s = "http://news.163.com/16/1219/10/C8L1M7EV000187VE.html";
    
    		while((s = br.readLine())!=null){
    
    		sb.append(s);
    
    		}
    
    		} catch (FileNotFoundException e) {
    
    		// TODO Auto-generated catch block
    
    		e.printStackTrace();
    
    		} catch (IOException e) {
    	
    		// TODO Auto-generated catch block
    
    		e.printStackTrace();
    	
    		}
    
//    		String ssss = ReduceHtml2Text.removeHtmlTag(sb.toString());
//    
//    		System.out.println(ssss);
    		String html = sb.toString();
    		System.out.println(HtmlCleanerUtils.getTextFromHtml(html));
    	
    		}
	    	
	    		}
	    		}


